library(tidyverse)
library(magrittr)
library(ngsReports)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(AnnotationHub)
library(ensembldb)
library(edgeR)
library(corrplot)
library(cqn)
library(DT)
library(Gviz)
library(AnnotationFilter)
library(Rsamtools)
library(kmer)
library(furrr)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
plan(multisession)
# options(future.plan = "multisession")
ah_Dr <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb_Dr <- ah_Dr[["AH74989"]]
trEns_Dr <- transcripts(ensDb_Dr) %>%
mcols() %>%
as_tibble()
trLen_Dr <- exonsBy(ensDb_Dr, "tx") %>%
width() %>%
vapply(sum, integer(1))
geneGcLen_Dr <- trLen_Dr %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Dr) %>%
group_by(gene_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
trGcLen_Dr <- trLen_Dr %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns_Dr) %>%
group_by(tx_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
genesGR_Dr <- genes(ensDb_Dr)
mcols(genesGR_Dr) <- mcols(genesGR_Dr)[c("gene_id", "gene_name",
"gene_biotype", "entrezid")]
txGR_Dr <- transcripts(ensDb_Dr)
mcols(txGR_Dr) <- mcols(txGR_Dr)[c("tx_id", "tx_name",
"tx_biotype", "tx_id_version", "gene_id")]
An EnsDb object was obtained for Ensembl release 98 using the AnnotationHub package. This provided the GC content and length for every gene and transcript in the release. For zebrafish, this consists of 37241 genes and 65905 transcripts.
This is a total RNA dataset generated from a 3-way comparison of WT zebrafish (Danio rerio) with heterozygous mutants (psen2S4Ter/+) and homozygous mutants (psen2S4Ter/S4Ter). Previous analysis of this dataset identified the possibility of incomplete ribosomal RNA (rRNA) removal (previous analysis).This dataset was chosen for an initial inspection as it is expected that rRNA sequences in zebrafish are more divergent from more common model organisms such as mice. Hence, inefficient rRNA depletion by kits that are generally optimised for these common model organisms may produce a stronger signal in zebrafish.
files <- list.files(
path = "../20170327_Psen2S4Ter_RNASeq/data/0_rawData/FastQC",
pattern = "zip",
full.names = TRUE
) %>%
str_subset("R1_fastqc.zip")
samples <- tibble(
sample = str_remove(basename(files), "_fastqc.zip"),
dataset = NA,
organism = NA
) %>%
mutate(
sample = ifelse(
str_detect(sample, "Ps2Ex"), str_remove(sample, "_R1"), sample
),
dataset = ifelse(
str_detect(sample, "Ps2Ex"), "Psen2S4Ter", dataset
),
organism = ifelse(
str_detect(sample, "(SRR213|SRR218|Ps2Ex)"), "zebrafish", organism
)
)
datasets <- samples$dataset %>%
unique()
The following analysis involves 12 samples across 1 dataset(s): Psen2S4Ter.
rawFqc <- files %>%
FastqcDataList()
data <- grepl("Ps2Ex", fqName(rawFqc))
labels <- rawFqc[data] %>%
fqName() %>%
str_remove("_6month_07_07_2016_F3") %>%
str_remove("\\.fastq\\.gz") %>%
str_remove("Ps2Ex3M1_")
rawLib <- plotReadTotals(rawFqc[data]) +
labs(subtitle = "Psen2S4Ter") +
scale_x_discrete(labels = labels)
The library sizes of the unprocessed dataset(s) range between 27,979,654 and 37,144,975 reads.
rawLib
rRNA transcripts are known to have high GC content. Therefore, surveying the GC content of the raw reads serves as a good starting point for detecting incomplete rRNA removal. A spike in GC content at > 70% is expected if this is the case.
plotly::ggplotly(
plotGcContent(
x = rawFqc[data],
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
) +
labs(title = "Psen2S4Ter Dataset (D. rerio)") +
theme(legend.position="none")
)
GC content of reads in the dataset. Clear spikes above 70% GC are observed, which is likely due to incomplete rRNA depletion.
The top 30 overrepresented sequences were analysed using blastn and were found to be predominantly rRNA sequences.
getModule(rawFqc, "Overrep") %>%
group_by(Sequence, Possible_Source) %>%
summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>%
arrange(desc(`Highest Percentage`), desc(`Found In`)) %>%
ungroup() %>%
dplyr::slice(1:30) %>%
mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
kable(
align = "llrr",
caption = paste(
"Top", nrow(.),"Overrepresented sequences.",
"The number of samples they were found in is shown,",
"along with the percentage of the most 'contaminated' sample."
)
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| Sequence | Possible_Source | Found In | Highest Percentage |
|---|---|---|---|
| GTGGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGC | No Hit | 12 | 1.72% |
| CCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATCC | No Hit | 12 | 1.32% |
| GGCCCGGCGCACGTCCAGAGTCGCCGCCGCACACCGCAGCGCATCCCCCC | No Hit | 9 | 1.31% |
| CTCCTGAAAAGGTTGTATCCTTTGTTAAAGGGGCTGTACCCTCTTTAACT | No Hit | 11 | 1.11% |
| GGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATC | No Hit | 12 | 1.09% |
| GGGGTGTACGAAGCTGAACTTTTATTCATCTCCCAGACAACCAGCTATTG | No Hit | 12 | 1.07% |
| GGCCCGGCGCACGTCCAGAGTCGCCGCCGCGCACCGCAGCGCATCCCCCC | No Hit | 10 | 1.03% |
| CCTCCTTCAAGTATTGTTTCATGTTACATTTTCGTATATTCTGGGGTAGA | No Hit | 12 | 0.82% |
| CCCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATC | No Hit | 12 | 0.79% |
| GTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATCT | No Hit | 12 | 0.79% |
| GGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCAT | No Hit | 12 | 0.78% |
| CGGGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG | No Hit | 12 | 0.73% |
| GGGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTT | No Hit | 11 | 0.68% |
| CGGGTCGGGTGGGTAGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG | No Hit | 12 | 0.57% |
| CTTAGACGACCTGGTAGTCCAAGGCTCCCCCAGGAGCACCATATCGATAC | No Hit | 11 | 0.54% |
| AGCTGGGGAGATCCGCGAGAAGGGCCCGGCGCACGTCCAGAGTCGCCGCC | No Hit | 11 | 0.53% |
| GGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTTA | No Hit | 10 | 0.53% |
| GGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGACGTGG | No Hit | 10 | 0.50% |
| GGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGAC | No Hit | 11 | 0.45% |
| CCCCCGAACCCTTCCAAGCCGAACCGGAGCCGGTCGCGGCGCACCGCCGA | No Hit | 10 | 0.45% |
| GTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGACG | No Hit | 10 | 0.43% |
| GCCCACTACGACAACGTGTTTTGTAAATTATGATCTTTATTCTCCTGAAA | No Hit | 10 | 0.43% |
| GGGTGTACGAAGCTGAACTTTTATTCATCTCCCAGACAACCAGCTATTGC | No Hit | 11 | 0.43% |
| GCCCCCGAACCCTTCCAAGCCGAACCGGAGCCGGTCGCGGCGCACCGCCG | No Hit | 10 | 0.42% |
| CCCCGAACCCTTCCAAGCCGAACCGGAGCCGGTCGCGGCGCACCGCCGAC | No Hit | 9 | 0.42% |
| GTCTGAAGTCTTGGAGCTCGACTTCCCTACGTTTCGCCTACAAATGTTGC | No Hit | 11 | 0.41% |
| GGGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGA | No Hit | 10 | 0.40% |
| CTCCTTCAAGTATTGTTTCATGTTACATTTTCGTATATTCTGGGGTAGAA | No Hit | 10 | 0.39% |
| GCTGGGGAGATCCGCGAGAAGGGCCCGGCGCACGTCCAGAGTCGCCGCCG | No Hit | 10 | 0.36% |
| CTGGGGAGATCCGCGAGAAGGGCCCGGCGCACGTCCAGAGTCGCCGCCGC | No Hit | 9 | 0.36% |
trimFqc <- list.files(
path = "../20170327_Psen2S4Ter_RNASeq/data/1_trimmedData/FastQC",
pattern = "zip",
full.names = TRUE
) %>%
str_subset("R1_fastqc.zip") %>%
FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
dplyr::rename(Raw = Total_Sequences) %>%
left_join(readTotals(trimFqc), by = "Filename") %>%
dplyr::rename(Trimmed = Total_Sequences) %>%
mutate(
Discarded = 1 - Trimmed/Raw,
Retained = Trimmed / Raw
)
After trimming of adapters between 4.16% and 5.08% of reads were discarded.
Trimmed reads were firstly aligned to rRNA sequences using the BWA-MEM algorithm to calculate the proportion of reads that were of rRNA origin within each sample. BWA-MEM is recommended for high-quality queries of reads ranging from 70bp to 1Mbp as it is faster and more accurate that alternative algorithms BWA-backtrack and BWA-SW.
rRnaProp <- samples %>%
dplyr::filter(dataset == "Psen2S4Ter") %>%
cbind(tibble(proportion = c(0.2291, 0.2504, 0.2615, 0.2260, 0.2215, 0.2128, 0.2468, 0.2284, 0.1366, 0.1188, 0.1892, 0.1608))) %>%
mutate(
sample = str_remove(sample, "_6month_07_07_2016_F3"),
sample = str_remove(sample, "Ps2Ex3M1_")
)
rRnaProp$dataset %<>%
factor(levels = c("Psen2S4Ter"))
rRnaProp %>%
ggplot(aes(sample, proportion)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~dataset, scales = "free_x") +
scale_y_continuous(labels = percent) +
labs(x = "Sample", y = "Percent of Total") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Percentages of each library that align to rRNA sequences with bwa mem.
Following alignment to known rRNA sequences, reads that mapped were removed to create a separate fastq file that did not contain sequences identified as rRNA. These remaining sequences were aligned to the Danio rerio GRCz11 genome (Ensembl release 98) using STAR 2.7.0d and summarised with featureCounts.
dgeList <- read_tsv("../20170327_Psen2S4Ter_RNASeq/data/2_alignedData/featureCounts/genes.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
set_colnames(str_remove(colnames(.), "_6month_07_07_2016_F3")) %>%
set_colnames(str_remove(colnames(.), "Ps2Ex3M1_")) %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
dgeList$genes <- genesGR_Dr[rownames(dgeList),]
mcols(dgeList$genes) %<>%
as.data.frame() %>%
left_join(geneGcLen_Dr)
dgeList$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(rRnaProp) %>%
column_to_rownames("rowname")
## file paths needed to be changed to the current filepath
## the original filepath (that featureCounts used) is different to the current
dgeList$samples$filenames <- read_tsv(
"../20170327_Psen2S4Ter_RNASeq/data/2_alignedData/featureCounts/genes.out"
) %>%
dplyr::select(str_subset(colnames(.), "Ps2Ex3M1_")) %>%
colnames() %>%
str_replace(., "/2_al", "/data/2_al")
gcInfo <- function(x) {
x$counts %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
as_tibble() %>%
pivot_longer(
cols = colnames(x),
names_to = "sample",
values_to = "counts"
) %>%
dplyr::filter(
counts > 0
) %>%
left_join(
geneGcLen_Dr
) %>%
dplyr::select(
ends_with("id"), sample, counts, aveGc, maxLen
) %>%
split(f = .$sample) %>%
lapply(
function(x){
DataFrame(
gc = Rle(x$aveGc, x$counts),
logLen = Rle(log10(x$maxLen), x$counts)
)
}
)
}
gcSummary <- function(x) {
x %>%
vapply(function(x){
c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
}, numeric(4)
) %>%
t() %>%
set_colnames(
c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
) %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble()
}
rle <- gcInfo(dgeList)
sumGc <- gcSummary(rle)
a <- sumGc %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_logLen)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean log(Length)"
)
b <- sumGc %>%
left_join(rRnaProp) %>%
ggplot(aes(proportion, mn_gc)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean GC Content"
)
ggarrange(a, b, ncol = 2, nrow = 1) %>%
annotate_figure("PsenS4Ter Dataset (D. rerio)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
genes2keep <- dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(6)
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
pca <- cpm(dgeFilt, log = TRUE) %>%
t() %>%
prcomp()
pcaCor <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sumGc) %>%
as_tibble() %>%
left_join(rRnaProp) %>%
dplyr::select(
PC1, PC2, PC3,
Mean_GC = mn_gc,
Mean_Length = mn_logLen,
rRna_Proportion = proportion
) %>%
cor()
a <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
ggplot(aes(PC1, PC2)) +
geom_point(size = 2) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = paste0("PC2 (", percent(summary(pca)$importance["Proportion of Variance","PC2"]),")")
)
b <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(rRnaProp) %>%
ggplot(aes(PC1, proportion)) +
geom_point(size = 2) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "rRNA Proportion of Initial Library"
)
c <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sumGc) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_logLen)) +
geom_point(size = 2) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean log(Length)"
)
d <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sumGc) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_gc)) +
geom_point(size = 2) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean GC"
)
ggarrange(a, b, c, d, ncol = 2, nrow = 2) %>%
annotate_figure("Psen2S4Ter")
PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.
corrplot(
pcaCor,
type = "lower",
diag = FALSE,
addCoef.col = 1, addCoefasPercent = TRUE
)
Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.
design <- model.matrix(~proportion, data = dgeFilt$samples)
voom <- voom(dgeFilt, design = design)
fit <- lmFit(voom, design = design)
eBayes <- eBayes(fit)
topTable <- eBayes %>%
topTable(coef = colnames(design)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
mutate(DE = adj.P.Val < 0.05) %>%
unite(Location, c(seqnames, start, end, width, strand), sep = ":") %>%
dplyr::select(
Geneid = gene_id,
Symbol = gene_name,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
Location,
t,
DE,
everything(),
-B
) %>%
as_tibble()
topTable %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR, DE) %>%
mutate(
AveExpr = format(round(AveExpr, 2), nsmall = 2),
logFC = format(round(logFC, 2), nsmall = 2),
P.Value = sprintf("%.2e", P.Value),
FDR = sprintf("%.2e", FDR)
) %>%
dplyr::slice(1:200) %>%
datatable(options = list(pageLength = 20), class = "striped hover condensed responsive", filter = "top")
dirs <- list.dirs(
"../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant/",
recursive = FALSE
)
salmon <- catchSalmon(paths = dirs)
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_80_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_81_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_84_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_89_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_78_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_82_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_83_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_85_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_79_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_86_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_88_Fem, 51605 transcripts, 0 bootstraps
## Reading ../20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_95_Fem, 51605 transcripts, 0 bootstraps
dgeListTx <- salmon$counts %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "_6month_07_07_2016_F3")) %>%
set_colnames(str_remove(colnames(.), "Ps2Ex3M1_")) %>%
as.data.frame() %>%
rownames_to_column("tx_id") %>%
mutate(tx_id = str_remove(tx_id, "\\.[0-9]+$")) %>%
column_to_rownames("tx_id") %>%
DGEList() %>%
calcNormFactors()
genes2keep <- dgeListTx %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(6)
dgeFiltTx <- dgeListTx[genes2keep,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
dna <- ensembldb::getGenomeTwoBitFile(ensDb_Dr)
# deIds <- topTable %>%
# dplyr::filter(DE) %>%
# .$Geneid
# notIds <- topTable %>%
# dplyr::filter(!DE) %>%
# .$Geneid
# deExons <- exonsBy(ensDb_Dr, by = "gene") %>%
# .[deIds] %>%
# lapply(function(x){
# GenomicRanges::reduce(x)
# }) %>%
# GRangesList()
# deStrings <- lapply(deExons, function(x){
# getSeq(dna, x) %>%
# unlist()
# }
# )
# deStringSet <- DNAStringSet(deStrings)
# writeXStringSet(
# deStringSet,
# "/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/deSeqs.fa"
# )
# conExons <- exonsBy(ensDb_Dr, by = "gene") %>%
# .[conIds] %>%
# lapply(function(x){
# GenomicRanges::reduce(x)
# }) %>%
# GRangesList()
# conStrings <- lapply(conExons, function(x){
# getSeq(dna, x) %>%
# unlist()
# }
# )
# conStringSet <- DNAStringSet(conStrings)
# writeXStringSet(
# conStringSet,
# "/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/conSeqs.fa"
# )
loadMer <- function(x){
read.table(
paste0(
"/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/",
x,
"_dumps.txt"
),
col.names = c("mer", "count")
) %>%
dplyr::arrange(-count) %>%
as_tibble()
}
merDek5 <- loadMer("dek5")
merDek6 <- loadMer("dek6")
merDek7 <- loadMer("dek7")
merDek8 <- loadMer("dek8")
merDek9 <- loadMer("dek9")
merDek10 <- loadMer("dek10")
merConk5 <- loadMer("conk5")
merConk6 <- loadMer("conk6")
merConk7 <- loadMer("conk7")
merConk8 <- loadMer("conk8")
merConk9 <- loadMer("conk9")
merConk10 <- loadMer("conk10")
x <- merDek10
y <- merConk10
merRatio <- function(x, y) {
join <- x %>%
full_join(y, by = "mer")
join[is.na(join)] <- 0
join %>%
mutate(
p.x = count.x / sum(count.x),
p.y = count.y / sum(count.y),
ratio = p.x / p.y
) %>%
arrange(desc(ratio))
}
merRatk5 <- merRatio(merDek5, merConk5)
merRatk6 <- merRatio(merDek6, merConk6)
merRatk7 <- merRatio(merDek7, merConk7)
merRatk8 <- merRatio(merDek8, merConk8)
merRatk9 <- merRatio(merDek9, merConk9)
merRatk10 <- merRatio(merDek10, merConk10)
plotRatk5 <- merRatk5 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 5") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk6 <- merRatk6 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 6") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk7 <- merRatk7 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 7") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk8 <- merRatk8 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 8") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk9 <- merRatk9 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 9") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk10 <- merRatk10 %>%
ggplot(aes(p.x, p.y)) +
geom_point(size = 1) +
labs(x = "DE", y = "Not DE", title = "k = 10") +
scale_x_sqrt() +
scale_y_sqrt() +
geom_abline(slope = 1, intercept = 0, col = "blue")
plotRat <- ggarrange(plotRatk5, plotRatk6, plotRatk7, plotRatk8,
plotRatk9, plotRatk10, nrow = 3, ncol = 2)
annotate_figure(
plotRat,
bottom = "Square root ratio of DE gene kmer counts vs not DE gene kmer counts"
)
exact.test <- function(w,x,y,z) {
test <- matrix(data = c(w, x, y, z), nrow = 2)
fisher.test(test) %>%
.$p.value
}
sumDek5 <- sum(merRatk5$count.x)
sumConk5 <- sum(merRatk5$count.y)
merPk5 <- merRatk5[1:10,] %>%
mutate(
p.val = future_pmap_dbl(list(count.x, sumDek5, count.y, sumConk5), exact.test),
p.adj = p.adjust(p.val, "bonf")
) %>%
dplyr::arrange(p.val)
sumDek6 <- sum(merRatk6$count.x)
sumConk6 <- sum(merRatk6$count.y)
merPk6 <- merRatk6 %>%
mutate(
p.val = future_pmap_dbl(list(count.x, sumDek6, count.y, sumConk6), exact.test),
p.adj = p.adjust(p.val, "bonf")
) %>%
dplyr::arrange(p.val)
sumDek7 <- sum(merRatk7$count.x)
sumConk7 <- sum(merRatk7$count.y)
merPk7 <- merRatk7 %>%
mutate(
p.val = future_pmap_dbl(list(count.x, sumDek7, count.y, sumConk7), exact.test),
p.adj = p.adjust(p.val, "bonf")
) %>%
dplyr::arrange(p.val)
sumDek8 <- sum(merRatk8$count.x)
sumConk8 <- sum(merRatk8$count.y)
merPk8 <- merRatk8 %>%
mutate(
p.val = future_pmap_dbl(list(count.x, sumDek8, count.y, sumConk8), exact.test),
p.adj = p.adjust(p.val, "bonf")
) %>%
dplyr::arrange(p.val)
sumDek9 <- sum(merRatk9$count.x)
sumConk9 <- sum(merRatk9$count.y)
merPk9 <- merRatk9 %>%
mutate(
p.val = future_pmap_dbl(list(count.x, sumDek9, count.y, sumConk9), exact.test),
p.adj = p.adjust(p.val, "bonf")
) %>%
dplyr::arrange(p.val)
sumDek10 <- sum(merRatk10$count.x)
sumConk10 <- sum(merRatk10$count.y)
merPk10 <- merRatk10 %>%
mutate(
p.val = future_pmap_dbl(list(count.x, sumDek10, count.y, sumConk10), exact.test),
p.adj = p.adjust(p.val, "bonf")
) %>%
dplyr::arrange(p.val)
# nBins <- list(length = 10, gc = 10)
# binTable <- topTable %>%
# dplyr::select(-gene_biotype, entrezid) %>%
# mutate(
# lengthBins = cut(
# log(aveLen),
# breaks = quantile(
# log(aveLen), seq(0, nBins$length)/nBins$length
# ),
# labels = paste0("L", seq_len(nBins$length)),
# include.lowest = TRUE
# ),
# gcBins = cut(
# aveGc,
# breaks = quantile(
# aveGc, seq(0, nBins$gc) / nBins$gc
# ),
# labels = paste0("GC", seq_len(nBins$gc)),
# include.lowest = TRUE
# ),
# bothBins = paste(lengthBins, gcBins, sep = "_"),
# bothBins = as.factor(bothBins)
# ) %>%
# group_by(bothBins) %>%
# mutate(n = n()) %>%
# ungroup() %>%
# dplyr::filter(n > 1) %>%
# as_tibble()
# countDnaKmers(as.character(deStringSet[1], use.names = FALSE), k = 8) %>%
# as.data.frame() %>%
# rownames_to_column("kmer") %>%
# as_tibble() %>%
# set_colnames(c("kmer", "count")) %>%
# dplyr::filter(count > 1)
# temp <- strsplit(as.character(deStringSet[1:100], use.names = FALSE), "")
# kcount(temp, k = 8) %>%
# t() %>%
# rowSums() %>%
# enframe(name = "kmer", value = "count") %>%
# dplyr::arrange(desc(count))
options(ucscChromosomeNames = FALSE)
ah_Dr <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb_Dr <- ah_Dr[["AH74989"]]
exons_Dr <- exonsBy(
ensDb_Dr,
by = "tx",
columns = c("exon_id", "gene_id", "tx_id", "gene_name"),
filter = AnnotationFilterList(
SeqNameFilter(c(1:25)),
GeneIdFilter("ENSD", "startsWith")
)
) %>%
unlist()
colnames(mcols(exons_Dr)) <- str_remove_all(colnames(mcols(exons_Dr)), "_id") %>%
str_replace_all("tx", "transcript")
mcols(exons_Dr) <- mcols(exons_Dr)[c("gene_name", "gene", "exon", "transcript")]
genesFilt_Dr <- genes(
ensDb_Dr,
filter = AnnotationFilterList(
SeqNameFilter(c(1:25)),
GeneIdFilter("ENSD", "startsWith")
)
)
order <- dgeList$samples %>%
dplyr::arrange(filenames) %>%
as_tibble() %>%
rownames_to_column("order") %>%
dplyr::arrange(-proportion)
## Gene region & genome axis tracks
sym <- "rpl29"
goi <- genesFilt_Dr[mcols(genesFilt_Dr)$gene_name == sym,]$gene_id
counts <- dgeList$counts[goi,]
gm <- subset(exons_Dr, gene == goi) %>%
subset(transcript %in% rownames(dgeFiltTx))
gmRed <- gm %>%
GenomicRanges::reduce()
st <- GenomicRanges::start(gmRed) %>%
min()
en <- GenomicRanges::end(gmRed) %>%
max()
chr <- seqnames(gmRed)@values
str <- gmRed %>%
strand() %>%
.@values
gTrack <- GenomeAxisTrack()
rrTrack <- GeneRegionTrack(
gmRed,
name = NULL,
fill = "darkorange"
)
rTrack <- GeneRegionTrack(
gm,
name = NULL
)
## GC content track
win <- slidingWindows(gmRed, width = 1) %>%
unlist()
start <- win %>%
start()
end <- win %>%
end()
seq <- getSeq(dna, win)
gc <- seq %>%
letterFrequency(letters = "GC", OR = "") %>%
.[,1]
tbl <- tibble(
start = start,
end = end,
gc = gc
) %>%
mutate(
seqnames = chr,
strand = str,
cumsum = cumsum(gc),
cummean = cummean(gc),
lag = dplyr::lag(cumsum, n = 50, default = NA),
lead = dplyr::lead(cumsum, n = 50, default = NA),
gcsum = lead - lag,
gcprop = gcsum / 101
)
gcTrack <- tbl %>%
dplyr::select(seqnames, start, end, strand, gcprop) %>%
makeGRangesFromDataFrame(
keep.extra.columns = TRUE
) %>%
DataTrack(
type = c("b"),
ylim = c(0, 1),
cex = 0.2,
name = "GC %",
col.title="black",
fontface.title=2,
fontsize=10,
grid = TRUE,
lwd.grid = 0.5,
lty.grid = "dashed",
v = 0
)
## Coverage tracks
files <- list.files(
"/data/biohub/20170327_Psen2S4Ter_RNASeq/data/2_alignedData/bam",
pattern = ".bam$",
full.names = TRUE
)
param <- ScanBamParam(
what = c("pos", "qwidth"),
which = GRanges(seqnames = chr, ranges = IRanges(st, en), strand = str),
flag=scanBamFlag(isUnmappedQuery=FALSE)
)
covFunc <- function(x){
prop <- order$proportion[order$filenames == x] %>%
percent(accuracy = 0.01)
reads <- counts[order$sample[order$filenames == x]]
pileup(x, scanBamParam = param) %>%
dplyr::filter(strand == str) %>%
mutate(start = pos, end = pos, count = count/reads) %>%
dplyr::select(seqnames, start, end, strand, count) %>%
makeGRangesFromDataFrame(keep.extra.columns = TRUE) %>%
DataTrack(
type = c("heatmap"),
cex = 0.2,
name = prop,
col.title="black",
fontface.title=2,
fontsize=10,
)
}
covList <- mclapply(files, covFunc, mc.cores = 8)
covOrd <- covList[as.integer(order$order)]
## k-mer track
geneMod <- subset(genesGR_Dr, gene_id == goi)
geneSeq <- getSeq(dna, geneMod)
merTop <- dplyr::filter(merPk8, ratio > 1) %>%
.$mer %>%
.[1:50]
merLoc <- vmatchPattern(merTop[25], geneSeq)
merLoc <- lapply(merTop, function(x){
vmatchPattern(x, geneSeq) %>%
unlist()
})
merStart <- rapply(merLoc, function(x){
start(x)
})
merWidth <- rapply(merLoc, function(x){
width(x)
})
hTrack <- HighlightTrack(
trackList = gcTrack,
start = start(geneMod) + merStart,
width = merWidth,
chromosome = chr,
inBackground = TRUE,
col = "white",
fill = "red"
)
## Plot the tracks
plotTracks(
c(gTrack, rrTrack, rTrack, covOrd, hTrack),
from = st - 100,
to = en + 100,
chromosome = chr,
sizes = c(1,0.5,1,rep(0.5, length(covOrd)),1),
main = sym,
cex.main = 1.5,
fontface.main = 1
)